AutoTel Unboxing
AutoTel Analysis (Work-in-Progress)¶
TODO:
- Geocode addresses using the Google Maps API - DONE
- Calculate distances using the Mapbox Directions API - Started (results not good; maybe try the Google Directions API)
- Find too short distances (depends on latter)
- Geocode designated parking spaces and mark on map
- Create holoviews map with date and time sliders, distance and minute show somehow
BIG TODO: somehow host or embed HoloViews+Bokeh dynamic maps
In [2]:
import numpy as np
import pandas as pd
import holoviews as hv
import qgrid
import seaborn as sns
#qgrid.enable()
%matplotlib inline
hv.extension('bokeh', 'matplotlib', width="100")
%opts Curve [width=600 height=250 tools=['hover'] ] {+framewise} VLine (color="black")
%opts Bars [width=800 height=400 tools=['hover'] group_index=1 legend_position='top_left' xrotation=90]
In [3]:
# Load the raw trips sheet (two header rows skipped) and name the 9 columns.
# usecols takes an explicit column list: the old integer form (usecols=8,
# meaning "columns 0..8") was the deprecated parse_cols semantics and is
# rejected by modern pandas.
df = pd.read_excel('data/ניתוח נסיעות.xlsx', sheet_name='Sheet1', skiprows=2,
                   usecols=list(range(9)), parse_dates=True,
                   names=['vehicle_id', 'order_id', 'shiryun_cat', 'avg_dist',
                          'billing_minutes', 'orig_addr', 'dest_addr', 'start_dt', 'end_dt'])
In [4]:
# Drop cancelled / no-show / self-service rows, then derive the average
# trip speed in km/h from distance and billed minutes.
excluded_statuses = ['Canceled', 'NO SHOW', 'Self Service']
keep = ~df.shiryun_cat.isin(excluded_statuses)
df = df[keep].assign(kmh=lambda frame: frame.avg_dist / (frame.billing_minutes / 60))
In [ ]:
# Top 10 fastest trips by average speed — implausibly high values flag bad rows.
df.sort_values(by='kmh', ascending=False).head(10)
In [6]:
df[df.kmh>60].shape  # how many trips averaged over 60 km/h (likely data errors)
Out[6]:
In [ ]:
df[df.kmh<0.05].head()  # near-zero speed trips — parked cars or bad distance data
In [8]:
# Total billed minutes and total distance per calendar day.
df.set_index('start_dt').resample('D')[['billing_minutes', 'avg_dist']].sum().plot(figsize=[10,7], title='Daily sums')
Out[8]:
In [9]:
# Mean billed minutes and mean distance per calendar day.
df.set_index('start_dt').resample('D')[['billing_minutes', 'avg_dist']].mean().plot(figsize=[10,7], title='Daily means')
Out[9]:
In [10]:
# Plausible trips only: under 100 minutes, under 100 km, speed in (0, 50) km/h.
plausible = (df.billing_minutes < 100) & (df.avg_dist < 100) & (df.kmh < 50) & (df.kmh > 0)
size = df[plausible].set_index('start_dt').resample('D').size()
size.plot(figsize=[10,7], title='Daily trip count')
Out[10]:
In [11]:
# Compute the filtered daily-count series once instead of repeating the full
# filter/resample chain for each smoothing span (the original duplicated it).
daily = (df[(df.billing_minutes < 100) & (df.avg_dist < 100) & (df.kmh < 50) & (df.kmh > 0)]
         .set_index('start_dt').resample('D').size())
ewm7 = daily.ewm(span=7).mean()    # 7-day exponentially weighted average
ewm30 = daily.ewm(span=30).mean()  # 30-day exponentially weighted average
# `size` is the daily count series produced in the previous cell.
pd.concat([size, ewm7, ewm30], axis=1, keys=['Daily count', '7 day avg', '30 day avg']).plot(figsize=[10,7], title='Daily trip count (with rolling window averages)')
Out[11]:
Too short¶
In [12]:
# Daily counts of suspiciously short trips.
def _daily_count(mask):
    # One-line helper: daily trip count for the rows selected by `mask`.
    return df[mask].set_index('start_dt').resample('D').size()

under_1min = _daily_count(df.billing_minutes <= 1)
under_1km = _daily_count(df.avg_dist <= 1)
short_trips = pd.concat([under_1min, under_1km], axis=1, keys=['Under 1 minute', 'Under 1 km'])
short_trips.plot(figsize=[10,7], title='Too short')
Out[12]:
Too long¶
In [13]:
# Billed minutes vs distance scatter over the full (unfiltered) data.
df.plot(kind='scatter', x='billing_minutes', y='avg_dist', figsize=[10,7])
Out[13]:
In [14]:
import altair as alt  # imported here; first used further below
# Same scatter, zoomed to trips under 300 minutes and 300 km.
df[(df.billing_minutes<300) & (df.avg_dist<300)].plot(kind='scatter', x='billing_minutes', y='avg_dist', figsize=[10,7],)
Out[14]:
Categorize by minutes and kms¶
- bin using .qcut() and then plot
- make a heatmap
Daily means¶
In [15]:
# 7-day-smoothed daily means of minutes and distance (outliers >= 200 removed); figure saved for the write-up.
ax = df[(df.billing_minutes<200) & (df.avg_dist<200)].set_index('start_dt').resample('D')[['billing_minutes', 'avg_dist']].mean().ewm(span=7).mean().plot(figsize=[10,7], title='Daily means')
ax.get_figure().savefig('means.png')
Medians¶
In [16]:
# 7-day-smoothed daily medians of minutes and distance (outliers >= 200 removed); figure saved for the write-up.
ax = df[(df.billing_minutes<200) & (df.avg_dist<200)].set_index('start_dt').resample('D')[['billing_minutes', 'avg_dist']].median().ewm(span=7).mean().plot(figsize=[10,7], title='Daily medians (smoothed)')
ax.get_figure().savefig('medians.png')
Monthly boxplots¶
In [17]:
import seaborn as sns            # already imported at the top; harmless re-import
import matplotlib.pyplot as plt
f, ax = plt.subplots(figsize=(10,7))
# Daily resampler over filtered trips; month label is recovered per day via
# min() on the constant-per-day strftime('%b') column.
dfm = df[(df.billing_minutes<100) & (df.avg_dist<100) & (df.kmh<50)].assign(month=lambda x: x.start_dt.dt.strftime('%b')).set_index('start_dt').resample('D')
# Boxplot of daily MEAN distance, grouped by month.
sns.boxplot(x = dfm.month.min(), y=dfm.avg_dist.mean(), ax=ax)
Out[17]:
In [18]:
f, ax = plt.subplots(figsize=(10,7))
# Boxplot of daily MEAN billing minutes per month (dfm from the previous cell).
sns.boxplot(x = dfm.month.min(), y=dfm.billing_minutes.mean(), ax=ax)
f.savefig('month_minutes_boxplot_mean.png')
In [19]:
f, ax = plt.subplots(figsize=(10,7))
# Boxplot of daily MEDIAN billing minutes per month.
sns.boxplot(x = dfm.month.min(), y=dfm.billing_minutes.median(), ax=ax)
f.savefig('month_minutes_boxplot_median.png')
Geo¶
Start by cleaning up the addresses
In [20]:
# Clean addresses for geocoding: collapse house-number ranges like "12-14"
# to the first number and strip stray backslashes.  regex=True is required:
# pandas 2.0 changed str.replace to treat the pattern literally by default,
# which would silently break the capture-group replacement below.
df['orig_addr_c'] = (df['orig_addr']
                     .str.replace(r'(\d+)-(\d+)', r'\1', regex=True)
                     .str.replace(r'\\', r'', regex=True))
df['dest_addr_c'] = (df['dest_addr']
                     .str.replace(r'(\d+)-(\d+)', r'\1', regex=True)
                     .str.replace(r'\\', r'', regex=True))
In [21]:
df[['orig_addr', 'orig_addr_c']].head(10)  # spot-check the address clean-up
Out[21]:
In [22]:
# Unique addresses across both origin and destination columns (the geocoding workload).
origs = pd.concat([df['orig_addr_c'], df['dest_addr_c']]).unique()
origs.shape
Out[22]:
In [23]:
pd.Series(origs[:10])  # sample of the unique cleaned addresses
Out[23]:
In [24]:
import googlemaps
# SECURITY: hard-coded API key committed to source — rotate this key and load
# it from an environment variable or untracked config file instead.
API_KEY = 'AIzaSyDVUvTvTqRCbZH6lhMl6VX3GQ_A5ucPKhc'
gmaps = googlemaps.Client(key=API_KEY)
# Smoke-test the geocoder on a single known address.
geocode_result = gmaps.geocode('גורדון 86, תל אביב יפו, ישראל')
geocode_result
Out[24]:
In [25]:
import pickle
#pickle.dump(d, open('data/coordinates_dict.pkl', 'wb'))
# Load the cached address -> coordinates dict to avoid re-geocoding.
# NOTE: pickle is unsafe on untrusted files — acceptable here for a local cache.
with open('data/coordinates_dict.pkl', 'rb') as f:
    d = pickle.load(f)
In [26]:
# Map cleaned addresses to cached coordinates; addresses missing from the cache become NaN.
df['orig_coord'] = df['orig_addr_c'].map(d)
df['dest_coord'] = df['dest_addr_c'].map(d)
In [27]:
df.head(10)  # inspect the newly attached coordinate columns
Out[27]:
In [28]:
#pickle.dump(df, open('data/df1.pkl', 'wb'))
In [29]:
# Split each coordinate pair into scalar columns.  apply(pd.Series) is slow
# but tolerates NaN rows for addresses that failed to geocode.
# NOTE(review): orig_x/dest_x appear to hold latitude and orig_y/dest_y
# longitude (Google geocode lat/lng order) — confirm before relying on this.
df[['orig_x', 'orig_y']] = df['orig_coord'].apply(pd.Series)
df[['dest_x', 'dest_y']] = df['dest_coord'].apply(pd.Series)
In [30]:
#pickle.dump(df, open('data/df2.pkl', 'wb'))
In [31]:
df.head()  # inspect the split coordinate columns
Out[31]:
Geo binning¶
In [32]:
import geopandas as gpd
import zipfile  # unused below; presumably kept for a zipped shapefile — remove if not needed
# Tel Aviv neighbourhood polygons.
n = gpd.read_file('data/tlv_neighborhoods/Neighbourhoods.shp')
In [33]:
n.shape  # number of neighbourhood polygons and attribute columns
Out[33]:
In [34]:
n.head()  # inspect the neighbourhood attributes
Out[34]:
In [35]:
n[['geometry']].unary_union  # merged city outline — visual sanity check of the polygons
Out[35]:
In [37]:
df[['orig_x', 'orig_y']].head()  # raw WGS84 coordinates before projection
Out[37]:
In [38]:
def wgs84_to_web_mercator(df, lon="orig_y", lat="orig_x"):
    """Project WGS84 degree coordinates to Web Mercator metres, in place.

    Note the cross-assignment: the easting (derived from the longitude
    column) is written to ``<lat>_merc`` and the northing to ``<lon>_merc``;
    this matches how the orig_x/orig_y columns are laid out in this
    notebook.  Mutates and returns ``df``.
    """
    earth_radius = 6378137  # spherical Earth radius in metres
    deg_to_merc = earth_radius * np.pi / 180.0
    df[lat + '_merc'] = df[lon] * deg_to_merc
    df[lon + '_merc'] = earth_radius * np.log(np.tan((90 + df[lat]) * np.pi / 360.0))
    return df
# Project origin and destination coordinates to Web Mercator (adds the *_merc columns).
df = df.pipe(wgs84_to_web_mercator, lon="orig_y", lat="orig_x").pipe(wgs84_to_web_mercator, lon="dest_y", lat="dest_x")
In [39]:
from geopandas import GeoDataFrame, GeoSeries
from shapely.geometry import Point

# Build origin points in Web Mercator metres.  Pass df.index explicitly:
# df was filtered earlier so its index is not a clean RangeIndex, and a
# default-indexed GeoSeries misaligns when assigned back onto df later
# (df['orig_nei'] = ... aligns on index labels, not position).
orig_geometry = [Point(xy) for xy in zip(df.orig_x_merc, df.orig_y_merc)]
orig_gs = GeoSeries(orig_geometry, index=df.index)
In [40]:
orig_gs.head()  # inspect the projected origin points
Out[40]:
In [42]:
# Match each origin point to the neighbourhood polygon containing it.
# pandas 2.0 removed Series.iteritems() — use .items().  Also mask rows that
# intersect no polygon: idxmax() on an all-False row silently returns the
# FIRST column label, which would mislabel points outside the city.
hits = [orig_gs.intersects(geom) for _, geom in n.geometry.items()]
orig_hits = pd.concat(hits, axis=1, keys=n.shemshchun.tolist())
orig_point_ins = orig_hits.idxmax(axis=1).where(orig_hits.any(axis=1))
orig_point_ins
Out[42]:
In [43]:
# Destination points, indexed like df so later index-aligned assignment is correct
# (df was filtered earlier, so a default RangeIndex would misalign).
dest_geometry = [Point(xy) for xy in zip(df.dest_x_merc, df.dest_y_merc)]
dest_gs = GeoSeries(dest_geometry, index=df.index)
In [44]:
# Same neighbourhood matching for destinations: .items() replaces the removed
# iteritems(), and all-False rows (outside the city) become NaN instead of
# idxmax() falling back to the first column label.
hits = [dest_gs.intersects(geom) for _, geom in n.geometry.items()]
dest_hits = pd.concat(hits, axis=1, keys=n.shemshchun.tolist())
dest_point_ins = dest_hits.idxmax(axis=1).where(dest_hits.any(axis=1))
dest_point_ins
Out[44]:
In [45]:
# Attach the matched neighbourhood names back onto the trips frame (aligns on index).
df['orig_nei'] = orig_point_ins
df['dest_nei'] = dest_point_ins
In [46]:
#pickle.dump(df, open('df_with_neis.pkl', 'wb'))
Trips that start/end outside of Tel Aviv-Yafo municipal borders¶
In [47]:
df[df.orig_nei.isna()].shape[0]/df.shape[0]  # share of trips starting outside the city
Out[47]:
In [48]:
df[df.dest_nei.isna()].shape[0]/df.shape[0]  # share of trips ending outside the city
Out[48]:
In [49]:
# Most common origin -> destination neighbourhood pairs.
df.groupby(['orig_nei', 'dest_nei']).size().sort_values(ascending=False)
Out[49]:
In [50]:
# Trip counts per neighbourhood, as origin and as destination.
# NOTE(review): on pandas >= 2.0 value_counts().reset_index() names the count
# column 'count' rather than the series name — these renames would need updating.
orig_neis = df.orig_nei.value_counts().reset_index().rename(columns={'index': 'nei', 'orig_nei': 'trips'})
dest_neis = df.dest_nei.value_counts().reset_index().rename(columns={'index': 'nei', 'dest_nei': 'trips'})
In [51]:
import altair as alt
from altair.expr import datum
# Render Altair inside the notebook; spill data to JSON files so charts
# aren't limited by the inline row cap.
alt.renderers.enable('notebook')
alt.data_transformers.enable('json')
Out[51]:
In [52]:
# Bar chart: trips per origin neighbourhood (axis titles are in Hebrew).
alt.Chart(orig_neis).mark_bar().encode(
    x=alt.X('nei:N', title='שכונת מוצא'),
    y=alt.Y('trips:Q', title='כמות נסיעות'),
    color=alt.Color('nei:N', legend=None),
)
Out[52]:
In [53]:
# Bar chart: trips per destination neighbourhood.
alt.Chart(dest_neis).mark_bar().encode(
    x=alt.X('nei:N', title='שכונת יעד'),
    y=alt.Y('trips:Q', title='כמות נסיעות'),
    color=alt.Color('nei:N', legend=None),
)
Out[53]:
In [54]:
# Long-format table: trips per neighbourhood, split by origin/destination role.
per_role = pd.concat(
    [df.orig_nei.value_counts(), df.dest_nei.value_counts()],
    axis=1, keys=['as_orig', 'as_dest'],
)
counts = per_role.stack().reset_index()
counts = counts.rename(columns={'level_0': 'nei', 'level_1': 'as_what', 0: 'trips'})
In [55]:
counts  # inspect the long-format neighbourhood counts
Out[55]:
In [56]:
# Stacked bars: origin vs destination trip counts per neighbourhood.
alt.Chart(counts.reset_index()).mark_bar().encode(
    x=alt.X('nei:N', title='שכונה'),
    y=alt.Y('trips:Q', title='כמות נסיעות'),
    color=alt.Color('as_what:N'),
)
Out[56]:
In [57]:
# Origin x destination count matrix; pairs with no trips filled with 0.
orig_dest_counts = df.groupby(['orig_nei', 'dest_nei']).size().to_frame().unstack().fillna(0)
In [58]:
# Keep only the dest_nei level of the column MultiIndex produced by unstack().
orig_dest_counts.columns = orig_dest_counts.columns.get_level_values(1)
In [59]:
# Neighbourhoods ordered by total incoming traffic, busiest first,
# plus a name -> rank lookup used to sort the rows below.
sorter = orig_dest_counts.sum().sort_values(ascending=False).index.tolist()
sorterIndex = dict(zip(sorter,range(len(sorter))))
In [60]:
# Attach each origin neighbourhood's busiest-first rank, then restore the index.
orig_dest_counts = orig_dest_counts.reset_index()
orig_dest_counts['rank'] = orig_dest_counts['orig_nei'].map(sorterIndex)
orig_dest_counts = orig_dest_counts.set_index('orig_nei')
In [61]:
# Reorder columns and rows busiest-first, then drop the helper column.
# drop(columns=...) — the positional axis argument was removed in pandas 2.0.
orig_dest_counts = orig_dest_counts[sorter+['rank']].sort_values(by='rank', ascending=False).drop(columns='rank')
In [62]:
orig_dest_counts.head()  # inspect the ordered origin x destination matrix
Out[62]:
In [63]:
# Reverse the Hebrew labels with [::-1] — presumably because matplotlib renders
# right-to-left text backwards; confirm against the rendered figure.
viz_df = orig_dest_counts.rename(columns={s:s[::-1] for s in list(orig_dest_counts.columns)}, index = {s:s[::-1] for s in list(orig_dest_counts.columns)})
In [64]:
# Heatmap of origin (rows) x destination (columns) trip counts.
fig, ax = plt.subplots(figsize=(30,30))
with sns.axes_style("white"):
    ax = sns.heatmap(viz_df, robust=True, square=True, cbar=False, ax=ax)
Distance¶
In [65]:
import requests, json

# SECURITY: hard-coded Mapbox token committed to source — rotate it and load
# it from an environment variable instead.
mapbox_access_token = 'pk.eyJ1IjoiY2plciIsImEiOiJjamhvM2pkMzEweWE2MzBzMW1sZTV0djY3In0.WbUMFEwp9GmgLP5NmCrkig'
url="https://api.mapbox.com/directions/v5/mapbox/driving/"

def get_distances(row):
    """Return the Mapbox driving distance (metres) for a trip row, or -1 on failure.

    Mapbox Directions expects each coordinate as {longitude},{latitude}.
    In this notebook orig_x/dest_x hold latitude and orig_y/dest_y hold
    longitude (Google geocode order), so the *_y value goes first — the
    original lat,lon order is the likely cause of the bad distances noted
    in the TODO at the top of the notebook.
    """
    origin = str(row['orig_y']) + ',' + str(row['orig_x'])
    destination = str(row['dest_y']) + ',' + str(row['dest_x'])
    coords = origin + ';' + destination
    response = requests.get(url + coords + '?access_token=' + mapbox_access_token)
    if response.status_code != 200:
        return -1  # keep the original error sentinel for callers
    # Parse JSON only on success: error bodies may not be valid JSON.
    data = json.loads(response.content)
    return data['routes'][0]['distance']
In [66]:
#pickle.dump(df, open('data/df3.pkl', 'wb'))
Holoviews¶
In [67]:
import holoviews as hv, geoviews as gv, dask.dataframe as dd, cartopy.crs as crs
from colorcet import fire
from holoviews.operation.datashader import datashade
# Datashade all trip origins (Web Mercator) over ESRI world imagery tiles.
options = dict(width=1000, height=600, xaxis=None, yaxis=None, bgcolor='black')
points = hv.Points(df, ['orig_x_merc', 'orig_y_merc'])
autotel_trips = datashade(points, x_sampling=0.8, y_sampling=0.8, cmap=fire).opts(plot=options)
url = 'https://server.arcgisonline.com/ArcGIS/rest/services/World_Imagery/MapServer/tile/{Z}/{Y}/{X}.jpg'
tiles = gv.WMTS(url, crs=crs.GOOGLE_MERCATOR)
tiles * autotel_trips
Out[67]:
In [68]:
import param, parambokeh
from colorcet import cm_n
from holoviews.streams import RangeXY
# ESRI tile URL template halves; the explorer inserts the service name between them.
url_base='https://server.arcgisonline.com/ArcGIS/rest/services/'
url_suffix = '/MapServer/tile/{Z}/{Y}/{X}.jpg'
opts = dict(width=800,height=800,xaxis=None,yaxis=None,bgcolor='black',show_grid=False)
class AutotelExplorer(hv.streams.Stream):
    # Interactive widget state for the datashaded trip map.
    alpha = param.Magnitude(default=0.35, doc="Alpha value for the map opacity")
    sampling = param.Magnitude(default=0.21, doc="Sampling")
    colormap = param.ObjectSelector(default=cm_n["fire"], objects=cm_n.values())
    # Plot trip origins or destinations.
    location = param.ObjectSelector(default='orig', objects=['orig', 'dest'])
    tile_type = param.ObjectSelector(default='World_Street_Map', objects=['World_Imagery', 'World_Topo_Map', 'World_Physical_Map'] if False else ['World_Imagery', 'World_Street_Map', 'World_Topo_Map', 'World_Physical_Map'])

    def make_view(self, x_range, y_range, **kwargs):
        """Render one frame: the chosen ESRI basemap under datashaded trip points for the current widget state and view range."""
        tiles = gv.WMTS(url_base+self.tile_type+url_suffix,crs=crs.GOOGLE_MERCATOR)
        map_tiles = tiles.options(alpha=self.alpha, **opts)
        points = hv.Points(df, [self.location+'_x_merc', self.location+'_y_merc'])
        # sampling**2 * 1000 maps the 0..1 widget onto a usable metre scale.
        autotel_trips = datashade(points, x_sampling=(self.sampling**2)*1000, y_sampling=(self.sampling**2)*1000, cmap=self.colormap,
                                  dynamic=False, x_range=x_range, y_range=y_range, width=800, height=800)
        return map_tiles * autotel_trips
In [69]:
# Wire the widgets to the stream and build the interactive map.
explorer = AutotelExplorer(name="AutoTel Trips")
parambokeh.Widgets(explorer, callback=explorer.event)
hv.DynamicMap(explorer.make_view, streams=[explorer, RangeXY()])
Out[69]:
Times¶
In [70]:
# Derive start-time features, then aggregate filtered trips into hourly bins:
# count of orders plus mean distance/minutes; the constant-per-hour labels are
# recovered with 'min'.
# NOTE(review): newer pandas deprecates the 'H' resample alias in favour of 'h'.
df['start_hour'] = df['start_dt'].dt.hour
df['start_dayofweek'] = df['start_dt'].dt.strftime('%A')
df['start_month'] = df['start_dt'].dt.strftime('%B')
dfh = (df[(df.billing_minutes<100) & (df.avg_dist<100) & (df.kmh<50)]
       .set_index('start_dt')
       .resample('1H')
       .agg({'start_hour': 'min', 'start_dayofweek': 'min', 'start_month': 'min',
             'order_id':'count', 'avg_dist': 'mean', 'billing_minutes': 'mean'}))
dfh.head()
Out[70]:
In [71]:
# Hour-of-day x day-of-week heatmap of mean hourly trip counts, one panel per month.
base = alt.Chart(dfh).mark_rect().encode(
    x = 'start_hour:O',
    y = alt.Y('start_dayofweek:N', sort=['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']),
    color='mean(order_id):Q',
    tooltip='mean(order_id):Q'
)
chart = alt.vconcat()
for month in df.start_month.unique():
    chart &= base.transform_filter(datum.start_month == month).properties(title=month)
chart
Out[71]:
In [72]:
# Same hour x weekday heatmap, coloured by mean trip distance.
base = alt.Chart(dfh).mark_rect().encode(
    x = 'start_hour:O',
    y = alt.Y('start_dayofweek:N', sort=['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']),
    color='mean(avg_dist):Q',
    tooltip='mean(avg_dist):Q'
)
chart = alt.vconcat()
for month in df.start_month.unique():
    chart &= base.transform_filter(datum.start_month == month).properties(title=month)
chart
Out[72]:
In [73]:
# Hour x weekday heatmap of mean billing minutes (hours averaging under 40 min).
# NOTE(review): unique()[3:] skips the first three months in data order — the
# reason isn't stated here; confirm it's intentional.
base = alt.Chart(dfh[dfh.billing_minutes<40]).mark_rect().encode(
    x = 'start_hour:O',
    y = alt.Y('start_dayofweek:N', sort=['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']),
    color=alt.Color('mean(billing_minutes):Q'),
    tooltip='mean(billing_minutes):Q'
)
chart = alt.vconcat()
for month in df.start_month.unique()[3:]:
    chart &= base.transform_filter(datum.start_month == month).properties(title=month)
chart
Out[73]:
In [74]:
# Trips per weekday, overall and per month.
# NOTE(review): mark_rect with a count() y-channel looks like it was meant to
# be mark_bar — confirm the intended chart type.
base = alt.Chart(df[['start_dayofweek', 'start_month']]).mark_rect().encode(
    x=alt.X('start_dayofweek:N', sort=['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']),
    y='count()',
    color='start_dayofweek:N',
)
total = base.properties(title='Total')
chart = alt.vconcat()
for month in df.start_month.unique():
    chart |= base.transform_filter(datum.start_month == month).properties(title=month)
total | chart
Out[74]:
Vehicles¶
In [75]:
# 7-day-smoothed count of distinct vehicles active per day.
df.set_index('start_dt').resample('1D').vehicle_id.nunique().ewm(span=7).mean().plot()
Out[75]:
In [76]:
# Distribution of trips per vehicle per day.
df.set_index('start_dt').groupby([pd.Grouper(freq='1D'), 'vehicle_id']).size().hist(bins=16)
Out[76]:
In [77]:
df.set_index('start_dt').groupby([pd.Grouper(freq='1D'), 'vehicle_id']).size().median()  # median daily trips per vehicle
Out[77]:
In [78]:
df.set_index('start_dt').groupby([pd.Grouper(freq='1D'), 'vehicle_id']).size().mean()  # mean daily trips per vehicle
Out[78]:
In [79]:
# Mean of total billed minutes per vehicle per day (trips >= 200 min excluded).
df[df.billing_minutes<200].set_index('start_dt').groupby([pd.Grouper(freq='1D'), 'vehicle_id']).billing_minutes.sum().mean()
Out[79]:
In [80]:
# Median of total billed minutes per vehicle per day (trips >= 200 min excluded).
df[df.billing_minutes<200].set_index('start_dt').groupby([pd.Grouper(freq='1D'), 'vehicle_id']).billing_minutes.sum().median()
Out[80]:
Comments !